package com.cmge.ad.spider.pic.meinv;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import com.cmge.ad.model.Album;
import com.cmge.ad.model.Picture;
import com.cmge.ad.spider.pipeline.MysqlAlbumPicturePipeline;

/**
 * @desc	Crawler for the 8090.com picture albums (status: already crawled).
 * 
 * 			http://www.8090.com/albumlist
 * 			http://www.8090.com/pages/album_list.aspx?page=1620
 * 
 * 			Pages whose URL contains {@link #URL_LIST} are treated as paginated
 * 			list pages: every album link found on them is queued, followed by the
 * 			next list page. Every other page is treated as an album detail page
 * 			whose images are emitted either as a single "picture" field (type 1)
 * 			or as an "album" field (type 2).
 * 
 * @author	ljt
 * @time	2014-12-30 7:51:34 PM
 */
public class BaLinJiuLinImagCrawl implements PageProcessor {

    /** Substring that identifies a paginated album-list URL. */
    public static final String URL_LIST = "album_list.aspx";

    /** Prefix of a list-page URL; the page number is appended to it. */
    private static final String LIST_PAGE_URL_PREFIX = "http://www.8090.com/pages/album_list.aspx?page=";

    /** Value stored in the source attribute of every picture produced here. */
    private static final String PICTURE_SOURCE = "balingjiuling_meinv";

    /** Highest list-page number that is crawled. */
    private static final int MAX_PAGE = 1620;

    /** Page number assumed when the list URL carries no usable page parameter. */
    private static final int DEFAULT_PAGE = 1;

    /** Value of the "type" field when a single picture is emitted. */
    private static final int TYPE_PICTURE = 1;

    /** Value of the "type" field when an album is emitted. */
    private static final int TYPE_ALBUM = 2;

    /** Extracts the numeric value of the "page" query parameter. */
    private static final Pattern PAGE_PARAM = Pattern.compile("[?&]page=(\\d+)");

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Dispatches the downloaded page to the list-page or the album-page handler,
     * depending on whether its URL contains {@link #URL_LIST}.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Literal substring test: URL_LIST contains a '.', which would act as a
        // wildcard if the constant were used as a regular expression.
        if (page.getUrl().get().contains(URL_LIST)) {
            processListPage(page);
        } else {
            processAlbumPage(page);
        }
    }

    /**
     * Queues every album link found on a list page, then queues the next list
     * page while the current page number is below {@link #MAX_PAGE}.
     */
    private void processListPage(Page page) {
        List<Selectable> links = page.getHtml()
                .xpath("//dd[@class='block']//div[@class='pic']//a[@target='_blank']/@href")
                .nodes();
        if (links != null) {
            for (Selectable link : links) {
                String href = link.get();
                if (!StringUtils.isEmpty(href)) {
                    page.addTargetRequest(href);
                }
            }
        }

        int current = parsePageNumber(page.getUrl().get());
        System.out.println("current is "+current);
        if (current < MAX_PAGE) {
            page.addTargetRequest(LIST_PAGE_URL_PREFIX + (current + 1));
        }

        // A list page only yields links, never data, so keep it away from the pipelines.
        page.setSkip(true);
    }

    /**
     * Extracts the image URLs of an album detail page. Exactly one image node is
     * emitted as field "picture" with type 1; several nodes are emitted as field
     * "album" with type 2. When no usable URL is found the page is marked as
     * skipped so that the pipelines are not invoked with empty results.
     */
    private void processAlbumPage(Page page) {
        List<Selectable> imageNodes = page.getHtml()
                .xpath("//ul[@class='images']//li//img/@data-src")
                .nodes();
        if (imageNodes == null || imageNodes.isEmpty()) {
            page.setSkip(true);
            return;
        }

        if (imageNodes.size() == 1) {
            String url = imageNodes.get(0).get();
            if (StringUtils.isEmpty(url)) {
                page.setSkip(true);
                return;
            }
            // Single image: store it directly as a Picture.
            page.putField("picture", newPicture(url));
            page.putField("type", TYPE_PICTURE);
            return;
        }

        List<Picture> pictures = new ArrayList<Picture>(imageNodes.size());
        for (Selectable node : imageNodes) {
            String url = node.get();
            if (!StringUtils.isEmpty(url)) {
                pictures.add(newPicture(url));
            }
        }
        if (pictures.isEmpty()) {
            page.setSkip(true);
            return;
        }

        Album album = new Album();
        album.setPictureList(pictures);
        // The first picture doubles as the album cover URL.
        album.setAlbumUrl(pictures.get(0).getMinImageUrl());
        page.putField("album", album);
        page.putField("type", TYPE_ALBUM);
    }

    /**
     * Builds a picture whose small and large image URLs are both {@code url}.
     */
    private static Picture newPicture(String url) {
        Picture picture = new Picture();
        picture.setDesc("");
        picture.setMinImageUrl(url);
        picture.setMaxImageUrl(url);
        picture.setSource(PICTURE_SOURCE);
        return picture;
    }

    /**
     * Reads the "page" query parameter of a list URL.
     *
     * @param url the list-page URL, may be {@code null}
     * @return the page number, or {@link #DEFAULT_PAGE} when the parameter is
     *         missing or does not fit into an {@code int}
     */
    private static int parsePageNumber(String url) {
        if (url == null) {
            return DEFAULT_PAGE;
        }
        Matcher matcher = PAGE_PARAM.matcher(url);
        if (!matcher.find()) {
            return DEFAULT_PAGE;
        }
        try {
            return Integer.parseInt(matcher.group(1));
        } catch (NumberFormatException e) {
            // Only reachable when the digit run overflows an int.
            System.err.println("Unparseable page number in " + url + ": " + e.getMessage());
            return DEFAULT_PAGE;
        }
    }

    /**
     * @return the crawl settings built for this processor (3 retries, 100 ms sleep)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded crawl at the first list page and writes the
     * results through {@link MysqlAlbumPicturePipeline}.
     */
    public static void main(String[] args) throws Exception {
        Spider qsSpider = Spider.create(new BaLinJiuLinImagCrawl())
                .addUrl(LIST_PAGE_URL_PREFIX + DEFAULT_PAGE)
                .addPipeline(new MysqlAlbumPicturePipeline())
                .thread(1);
        qsSpider.start();
    }

}
